
* prepare updates content
use "content of updates/content of updates.dta", clear

split date, p("-")
destring date1, gen(year)
destring date2, gen(month)
destring date3, gen(day)
drop date
gen date = mdy(month, day, year)

destring updates_lexdiv, replace
destring updates_read, replace
destring updates_entropy, replace
destring updates_words, replace

* get mean variable when a campaigns posts multiple updates on a given day
collapse (mean) updates_lexdiv updates_read updates_entropy updates_words, by(date url)

save "content_temp.dta", replace


* compute backers' topic distinctiveness
use "comments\topic proportions.dta", clear

collapse (mean) V*, by(date url)

gen hhi = 100*(V1^2 + V2^2 + V3^2 + V4^2 + V5^2 + V6^2 + V7^2 + V8^2 + V9^2 + V10^2 + ///
               V11^2 + V12^2 + V13^2 + V14^2 + V15^2 + V16^2 + V17^2 + V18^2 + V19^2 + V10^2 + ///
			   V21^2 + V22^2 + V23^2 + V24^2 + V25^2 + V26^2 + V27^2 + V28^2 + V29^2 + V30^2 + ///
			   V31^2 + V32^2 + V33^2 + V34^2 + V35^2 + V36^2 + V37^2 + V38^2 + V39^2 + V40^2 + ///
			   V41^2 + V42^2 + V43^2 + V44^2 + V45^2 + V46^2 + V47^2 + V48^2 + V49^2 + V50^2)

drop V*
save "hhi_temp.dta", replace



* prepare uncertainty / sentiment based on RoBERTa
import delimited "comments\based on transformers\predictions.csv", bindquote(strict) maxquotedrows(unlimited) clear

* restrict to ordinary backers
keep if authorbadges == "" | authorbadges == "superbacker"

split time, p(" ")
split time1, p("-")
destring time11, gen(day)
destring time13, gen(year)
gen month = 1
replace month = 2 if time12 == "February"
replace month = 3 if time12 == "March"
replace month = 4 if time12 == "April"
replace month = 5 if time12 == "May"
replace month = 6 if time12 == "June"
replace month = 7 if time12 == "July"
replace month = 8 if time12 == "August"
replace month = 9 if time12 == "September"
replace month = 10 if time12 == "October"
replace month = 11 if time12 == "November"
replace month = 12 if time12 == "December"
gen date = mdy(month, day, year)

gen total_roberta = wordcount(sentence)

gen pos_roberta = 0
replace pos_roberta = total_roberta if label_sentiment == "positive"
gen neg_roberta = 0
replace neg_roberta = total_roberta if label_sentiment == "negative"
gen uncertain_roberta = 0
replace uncertain_roberta = total_roberta if label_uncertainty == "LABEL_1"

collapse (sum) uncertain_roberta pos_roberta neg_roberta total_roberta, by(date projecturl)

split projecturl, p("?ref=kicktraq")
drop projecturl
rename projecturl1 url

save "roberta_temp.dta", replace



* prep Updates
import excel "updates.xls", sheet("Sheet1") firstrow clear
drop running_id
save "updates_temp.dta", replace


* prep funding data
import excel "daily funding.xls", sheet("Sheet1") firstrow clear
rename id running_id
destring daily_pledge, replace
drop if missing(daily_pledge)
save "funding_temp.dta", replace


* get Kicktraq links
import delimited "links.txt", clear

rename v1 running_id
rename v2 url

split url, p("/")

rename url6 slug
drop url*

* merge with webrobots data
merge 1:m slug using "webrobots\projects info.dta"
drop if _merge == 2
drop _merge state creator

generate double launched_at2 = launched_at*1000 + mdyhms(1,1,1970,0,0,0)
drop launched_at
gen launched_at=dofc(launched_at2)
format launched_at %td
drop launched_at2

sort running_id state_changed_at fx_rate
duplicates drop running_id country currency launched_at, force
drop state_changed_at created_at deadline

* manually add info on projects missing in webrobots data
replace country = "US" if slug == "the-bone"
replace currency = "USD" if slug == "the-bone"
replace fx_rate = 1 if slug == "the-bone"
replace id = 10000001 if slug == "the-bone"
replace launched_at = td(29.06.2021) if slug == "the-bone"
replace name = "The Bone: Get superpowers in real life!" if slug == "the-bone"
replace spotlight = "false" if slug == "the-bone"
replace staff_pick = "false" if slug == "the-bone"
replace goal = 50000 if slug == "the-bone"

replace country = "CA" if slug == "blue-june"
replace currency = "CAD" if slug == "blue-june"
replace fx_rate = 0.8145591 if slug == "blue-june"
replace id = 10000002 if slug == "blue-june"
replace launched_at = td(22.06.2021) if slug == "blue-june"
replace name = "Blue June" if slug == "blue-june"
replace spotlight = "false" if slug == "blue-june"
replace staff_pick = "true" if slug == "blue-june"
replace goal = 30000 if slug == "blue-june"

replace country = "AU" if slug == "brusniks-long-way-home-a-woke-satire-video-game"
replace currency = "AUD" if slug == "brusniks-long-way-home-a-woke-satire-video-game"
replace fx_rate = 0.7308184 if slug == "brusniks-long-way-home-a-woke-satire-video-game"
replace id = 10000003 if slug == "brusniks-long-way-home-a-woke-satire-video-game"
replace launched_at = td(20.06.2021) if slug == "brusniks-long-way-home-a-woke-satire-video-game"
replace name = "90's Styled Video Game: Brusnik's Long Way Home" if slug == "brusniks-long-way-home-a-woke-satire-video-game"
replace spotlight = "false" if slug == "brusniks-long-way-home-a-woke-satire-video-game"
replace staff_pick = "false" if slug == "brusniks-long-way-home-a-woke-satire-video-game"
replace goal = 30000 if slug == "brusniks-long-way-home-a-woke-satire-video-game"

replace country = "US" if slug == "cloudscape"
replace currency = "USD" if slug == "cloudscape"
replace fx_rate = 1 if slug == "cloudscape"
replace id = 10000004 if slug == "cloudscape"
replace launched_at = td(22.06.2021) if slug == "cloudscape"
replace name = "Cloudscape" if slug == "cloudscape"
replace spotlight = "false" if slug == "cloudscape"
replace staff_pick = "true" if slug == "cloudscape"
replace goal = 50000 if slug == "cloudscape"

replace country = "US" if slug == "illuminati-confirmed"
replace currency = "USD" if slug == "illuminati-confirmed"
replace fx_rate = 1 if slug == "illuminati-confirmed"
replace id = 10000005 if slug == "illuminati-confirmed"
replace launched_at = td(30.11.2020) if slug == "illuminati-confirmed"
replace name = "ILLUMINATI CONFIRMED" if slug == "illuminati-confirmed"
replace spotlight = "false" if slug == "illuminati-confirmed"
replace staff_pick = "true" if slug == "illuminati-confirmed"
replace goal = 210000 if slug == "illuminati-confirmed"

replace country = "GB" if slug == "occupy-white-walls"
replace currency = "GBP" if slug == "occupy-white-walls"
replace fx_rate = 1.295882 if slug == "occupy-white-walls"
replace id = 10000006 if slug == "occupy-white-walls"
replace launched_at = td(17.03.2020) if slug == "occupy-white-walls"
replace name = "Occupy White Walls: Art, Architecture and AI MMO" if slug == "occupy-white-walls"
replace spotlight = "false" if slug == "occupy-white-walls"
replace staff_pick = "true" if slug == "occupy-white-walls"
replace goal = 100000 if slug == "occupy-white-walls"
replace name = "" if slug == "occupy-white-walls"

replace country = "IL" if slug == "plot-of-the-druid"
replace currency = "USD" if slug == "plot-of-the-druid"
replace fx_rate = 1 if slug == "plot-of-the-druid"
replace id = 10000007 if slug == "plot-of-the-druid"
replace launched_at = td(22.06.2021) if slug == "plot-of-the-druid"
replace name = "Plot of the Druid" if slug == "plot-of-the-druid"
replace spotlight = "false" if slug == "plot-of-the-druid"
replace staff_pick = "false" if slug == "plot-of-the-druid"
replace goal = 25000 if slug == "plot-of-the-druid"

replace country = "JP" if slug == "war-of-ashird-a-strategic-turn-based-jrpg"
replace currency = "JPY" if slug == "war-of-ashird-a-strategic-turn-based-jrpg"
replace fx_rate = 0.0093578 if slug == "war-of-ashird-a-strategic-turn-based-jrpg"
replace id = 10000008 if slug == "war-of-ashird-a-strategic-turn-based-jrpg"
replace launched_at = td(22.04.2020) if slug == "war-of-ashird-a-strategic-turn-based-jrpg"
replace name = "War of Ashird: A Strategic Turn-Based JRPG" if slug == "war-of-ashird-a-strategic-turn-based-jrpg"
replace spotlight = "false" if slug == "war-of-ashird-a-strategic-turn-based-jrpg"
replace staff_pick = "false" if slug == "war-of-ashird-a-strategic-turn-based-jrpg"
replace goal = 82000 if slug == "war-of-ashird-a-strategic-turn-based-jrpg"

* correct one mistake
replace goal = 30000 if slug == "desolatium-new-graphic-adventure-based-on-lovecraft-mythos"

* merge with funding data
merge 1:m running_id using "funding_temp.dta"
drop _merge
erase "funding_temp.dta"

* convert currencies to USD (funding goal is already in USD)
gen daily_usd = daily_pledge*fx_rate

replace url = substr(url, 2, .)
gen date = launched_at + day - 1
format date %td


* merge with topic heterogeneity
merge 1:1 date url using "hhi_temp.dta"
replace hhi = 100 if missing(hhi)
drop if _merge == 2
drop _merge
erase hhi_temp.dta

* merge with roberta measures
merge 1:1 date url using "roberta_temp.dta"
foreach var in uncertain_roberta pos_roberta neg_roberta total_roberta {
	replace `var' = 0 if _merge == 1
	}
drop if _merge == 2
drop _merge
erase roberta_temp.dta

* merge with bag-of-words sentiment
merge 1:1 date url using "comments/comments sentiment ordinary.dta"

* replace sentiment/words with zero in days without comments
foreach var in total_words_ordinary neg_ordinary pos_ordinary {
	replace `var' = 0 if _merge == 1
	}
drop if _merge == 2
drop _merge


gen region = 1
replace region = 2 if country == "BE" | country == "CH" | country == "DE" ///
 | country == "DK" | country == "ES" | country == "FR" | country == "GB" ///
 | country == "IL" | country == "NO" | country == "PL" | country == "SE"

replace region = 3 if country == "AU" | country == "HK" | country == "JP" | country == "SG"

* drop one campaign where comments are mostly in German
drop if slug == "anstoss-2022"


* merge updates
merge 1:1 url date using "updates_temp.dta"
drop _merge
erase "updates_temp.dta"

tsset running_id day

bysort running_id (day): gen cum_usd = sum(daily_usd)
bysort running_id (day): gen cum_pledge = sum(daily_pledge)

gen goal_reached = 0
replace goal_reached = 1 if cum_usd >= goal

bysort running_id (day): gen cum_update = sum(update)
replace update = 1 if update > 0
bysort running_id (day): gen cum_update_bin = sum(update)


* merge bag-of-words uncertainty
merge 1:1 url date using "comments/comments uncertainty ordinary.dta"
drop if _merge == 2
replace uncertainty = 0 if missing(uncertainty)
drop number_of_words _merge


* merge with updates content
merge 1:1 date url using "content_temp.dta"
drop if _merge == 2
foreach var of varlist updates_lexdiv updates_read updates_entropy updates_words {
	replace `var' = . if update == 0
}

drop _merge
erase "content_temp.dta"



*********** milestones
gen round_amount = round(cum_pledge)
tostring round_amount, gen(round_amount_string)
gen digits = strlen(round_amount_string)

gen round_first = .
replace round_first = floor(cum_pledge/10) if digits == 2
replace round_first = floor(cum_pledge/100) if digits == 3
replace round_first = floor(cum_pledge/1000) if digits == 4
replace round_first = floor(cum_pledge/10000) if digits == 5
replace round_first = floor(cum_pledge/100000) if digits == 6
replace round_first = floor(cum_pledge/1000000) if digits == 7
replace round_first = floor(cum_pledge/10000000) if digits == 8
replace round_first = floor(cum_pledge/100000000) if digits == 9

gen round_sec = .
replace round_sec = floor(cum_pledge/10) if digits == 3
replace round_sec = floor(cum_pledge/100) if digits == 4
replace round_sec = floor(cum_pledge/1000) if digits == 5
replace round_sec = floor(cum_pledge/10000) if digits == 6
replace round_sec = floor(cum_pledge/100000) if digits == 7
replace round_sec = floor(cum_pledge/1000000) if digits == 8
replace round_sec = floor(cum_pledge/10000000) if digits == 9

tostring round_sec, gen(sec)
replace sec = substr(sec,1,1)
destring sec, replace

sort running_id day
gen diff_first = round_first - l1.round_first
gen diff_sec = round_sec - l1.round_sec

gen mile0 = 0
replace mile0 = 1 if diff_first != 0 & !missing(diff_first) &  daily_pledge > 0

keep running_id name url date day region daily_pledge fx_rate daily_usd ///
 update updates_words updates_read updates_lexdiv ///
 pos_roberta neg_roberta uncertain_roberta total_roberta ///
 pos_ordinary neg_ordinary uncertainty total_words_ordinary ///
 hhi goal_reached mile0
 
order running_id name url date day region daily_pledge fx_rate daily_usd ///
 update updates_words updates_read updates_lexdiv ///
 pos_roberta neg_roberta uncertain_roberta total_roberta ///
 pos_ordinary neg_ordinary uncertainty total_words_ordinary ///
 hhi goal_reached mile0
 
label variable running_id "ID"
label variable name "Project name"
label variable url "Project URL"
label variable date "Date"
label variable day "Day relative to campaign start"
label variable daily_pledge "Daily funding in native currency"
label variable fx_rate "Exchange rate"
label variable daily_usd "Daily funding in USD"
label variable region "Region"
label variable update "Update yes/no"
label variable updates_words "Update text: number of words"
label variable updates_read "Update text: Flesch reading ease score"
label variable updates_lexdiv "Update text: lexical diversity"
label variable pos_roberta "Positive sentiment score"
label variable neg_roberta "Negative sentiment score"
label variable uncertain_roberta "Unertainty score"
label variable total_roberta "Total number of words"
label variable pos_ordinary "Positive sentiment score (bag-of-words)"
label variable neg_ordinary "Negative sentiment score (bag-of-words)"
label variable uncertainty "Unertainty score (bag-of-words)"
label variable total_words_ordinary "Total number of words (bag-of-words)"
label variable hhi "Homogeneity of topics (HHI)"
label variable goal_reached "Funding goal reached yes/no"
label variable mile0 "Funding milestone yes/no"

save "campaign day dataset", replace

















